\[~\]
\[~\]
We are provided with some preprocessed data coming from Autism Brain Image Data Exchange Project. This dataset is partitioned in two subsets: the Autism Spectrum Disorder (ASD) and the Typically Developed (TD) sets.
\[~\]
\[~\]
As shown in the image above, each subset contains the records of 12 patients. A sample of 145 records is associated to each person. Formally, this sample can be treated as an IID sample defined as:
\[ D^{(i)}_n = \{X_1, ..., X_n\} \sim f_X(\cdot) \hspace{1em} \text{ for a certain } f_X \text{, having } n=145 \] Each \(X_i\) is defined as a random vector \(X_i = [X_i(1), ..., X_i(D)]^T \in \mathbb{R}^D\), where \(D=116\) is the number of the features, i.e the number of the Regions of Interest (ROI) of the person’s brain. We also assume that each patient within a group can be himself treated as an IID sample from a certain \(f_D(\cdot)\). Thus, moving bottom-up in the data tree, each group formally corresponds to:
\[ D_n = \{D^{(1)}_n, D^{(2)}_n, ..., D^{(12)}_n\} \] It is worth to note that the IID assumptions at the two levels of the data tree are quite crucial. Thanks to them, we have different ways of pooling the data:
At this point, we decide to follow the second approach, computing the covariance matrix \(\hat{\Sigma}^{(i)}\) associated to each \(D^{(i)}_n\), \(i=1,...,12\), and taking the mean of them; by the IID property of \(\{D^{(1)}_n, ..., D^{(12)}_n\}\), we have that \(\big\{\hat{\Sigma}^{(i)}\big\}_{i=1}^{12}\) is still IID, being function of each \(D_n\). This choice has many advantages, which will be pointed out further in the discussion.
\[~\]
\[~\]
After loading the data provided by the professor, create the covariance matrix for each patient; Z-scale this matrix through the Fisher’s Z-transform and after that create a unique matrix per group considering the olympic mean by \(X_n = (x_1, ... , x_{12})\) for each element in the matrix.
\[~\]
# define a function in order to catch each person and return the correlation matrix between ROIs
get.cor.ROIs <- function(...){
args <- list(...)
if(!is.null(args[["person"]])) {
args$frame <- args[["group"]][[args[["person"]]]]
}
n <- ncol(args[["frame"]])
nms <- names(args[["frame"]])
# takes efficiently the combinations, this is useful for large dataset!
V1 <- rep(nms[1:(n-1)], seq(from=n-1, to = 1, by = -1))
V2 <- unlist(lapply(1:(n-1), function(i)(nms[(i+1):n])))
corr.ROIs <- data.frame(ROI_1=V1, ROI_2=V2) # create the correlation in which I will insert the values
corr.ROIs$z.pearson.corr <- apply(corr.ROIs, 1, function(row) {
atanh(cor(args[["frame"]][row["ROI_1"]], args[["frame"]][row["ROI_2"]])) # takes the sets of columns; apply the Z-transform to the correlation matrix
})
return(corr.ROIs)
}
\[~\]
Now, for each group create for each corresponding patient the matrix of correlations in the Z-scale.
\[~\]
# create the matrix correlations for all patients
for(person in names(asd_sel))
assign(paste("z.trans.", person, sep=""), get.cor.ROIs(group = asd_sel, person = person))
for(person in names(td_sel))
assign(paste("z.trans.", person, sep=""), get.cor.ROIs(group = td_sel, person = person))
\[~\]
After that, we pooled the data considering the olympic mean between these datasets for each group, this is due to …. ecccccc
\[~\]
# create a unique matrix for each group
unique.matrix <- function(group) {
mtx <- z.trans.caltech_0051472[c("ROI_1", "ROI_2")] # create matrix with combinations
mtx$cor.mean <- apply(mtx, 1, function(row) {
values <- c()
for(person in names(group)) {
frame <- get(paste("z.trans.", person, sep="")) # match the address of the string with the real variable
elem <- frame[(frame[["ROI_1"]] == row["ROI_1"]) & (frame[["ROI_2"]] == row["ROI_2"]), "z.pearson.corr"] # select the correlation
values <- c(values, elem)
}
#values <- values[values != min(values)]; values <- values[values != max(values)] # remove the min and max, we have continuous values, so the prob. to have twice the min or max is impossible..
mean(values) # take the mean!
})
return(mtx)
}
\[~\]
Now, we call and create the matrix for each group of people as in this way:
\[~\]
# call the creation of unique matrix
asd_sel.dt <- unique.matrix(asd_sel); head(asd_sel.dt)
## ROI_1 ROI_2 cor.mean
## 1 2001 2002 0.51188381
## 2 2001 2101 0.02486222
## 3 2001 2102 -0.21203874
## 4 2001 2111 -0.05246567
## 5 2001 2112 -0.19052649
## 6 2001 2201 0.14638295
td_sel.dt <- unique.matrix(td_sel); head(td_sel.dt)
## ROI_1 ROI_2 cor.mean
## 1 2001 2002 0.416005628
## 2 2001 2101 0.086652336
## 3 2001 2102 -0.334109327
## 4 2001 2111 0.007217533
## 5 2001 2112 -0.150770771
## 6 2001 2201 0.164410598
\[~\]
\[~\]
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
halfHeatmap <- function(x) {
# Give extreme colors:
xx <- x[order(x$cor.mean, decreasing = TRUE),][1:100,]
return(ggplot(xx, aes(ROI_1, ROI_2, fill = cor.mean)) +
ggtitle("Top 100 correlations") +
geom_tile() +
scale_fill_gradient(low="pink", high="darkorchid4") +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(hjust = 0.5)) +
labs(legend = "Correlation"))
}
halfHeatmap(asd_sel.dt)
\[~\]
halfHeatmap(td_sel.dt)
\[~\]
hist(asd_sel.dt$cor.mean, probability = T, breaks = 50, col = "orange", main = "Histogram of ASD group", xlab = "olympic mean", ylab = "count", border = "white")
\[~\]
hist(td_sel.dt$cor.mean, probability = T, breaks = 50, col = "steelblue", main = "Histogram of TD group", xlab = "olympic mean", ylab = "count", border = "white")
\[~\]
hist(rbind(asd_sel.dt[["cor.mean"]], td_sel.dt[["cor.mean"]]), probability = T, breaks = 50, col = "purple", main = "Histogram of ASD+TD group", xlab = "olympic mean", ylab = "count", border = "white")
\[~\]
\[~\]
Start to find the quantile value for these two groups. Since working on the Z-scale, the threshold is also in the Z-scale.
\[~\]
z.t <- apply(rbind(asd_sel.dt["cor.mean"], td_sel.dt["cor.mean"]), 2, quantile, probs=0.8)
paste("the threshold at the 80th percentile for the subjects is: ", round(as.numeric(z.t), 3), sep = "")
## [1] "the threshold at the 80th percentile for the subjects is: 0,125"
\[~\]
We compute the confidence interval for \(\bar{Z}_{12}(j,k)\). Remember that we started computing the estimate of the covariance matrix for each person in each of the two groups, having:
\[ \Big\{\hat{\rho}^{(i)}_{j,k}\Big\}_{i=1}^{12}, \text{ having } j,k \in \{1,...,116\}, \text{ } j\neq k \]
Then, we applied the Fisher’s Z-transform:
\[ \Big\{\hat{Z}^{(i)}_{j,k}\Big\}_{i=1}^{12} = \Big\{atan\left(\hat{\rho}^{(i)}_{j,k}\right)\Big\}_{i=1}^{12} \]
Finally, we computed the estimator:
\[ \overline{Z}_{12}(j,k) = \frac{1}{12}\sum_{i=1}^{12}\hat{Z}^{(i)}_{j,k} \text{ ,} \]
\[~\]
remembering that \(\hat{Z}^{(i)}_{j,k}\) are \(IID\), being function of \(\hat{\rho}^{(i)}_{j,k}\), respectively.
Now, we calculate the confidence interval for each \(\overline{Z}_{12}(j,k)\). Starting from:
\[~\]
\[ \frac{\overline{Z}_{12}(j,k) - z_{j,k}}{\sigma_{j,k} / \sqrt{12}}, \\ \text{ where } \sigma_{j,k} = \frac{1}{\sqrt{145 - 3}} \]
and applying the Bonferroni’s correction:
\[ \frac{\alpha}{m}, \text{ where } m = \begin{pmatrix} D \\ 2 \end{pmatrix}, \text{ } D=116 \]
\[~\]
we end up with:
\[~\]
\[ C_{12}^{Z(j,k)}\Big(\frac{\alpha}{m}\Big) = \bigg[\overline{Z}_{12}(j,k) \mp z_{\alpha/2m} \cdot \frac{\sigma_{j,k}}{\sqrt{12}}\bigg] \]
\[~\]
conf.int <- function(dt, m = 1, alpha = 0.05) { # m : Family-Wise Error Rate correction (Bonferroni)
# Asymptotic variance
n.samp <- 145 # number of IID samples on which we computed the correlation matrix
sigma <- 1 / sqrt(n.samp - 3) # standard deviation of the (j,k)-th Z-transformed Pearson coefficent
n.p <- 12 # number of people in each of the two groups
se <- sigma / sqrt(n.p) # standard error of the estimator Z12
# Confidence interval
cint <- sapply(dt$cor.mean, function(z.i) {
list(confidence_interval = c(lb = z.i - se * qnorm(1 - alpha / (2*m)),
ub = z.i + se * qnorm(1 - alpha / (2*m))))
})
return(cint)
}
\[~\]
Finally:
\[~\]
## Compute the confidence interval
m <- (116 * 115) / 2 # number of intervals
asd_sel.dt$cint <- conf.int(asd_sel.dt, m = m); asd_sel.dt$cint[1:3]
## $confidence_interval
## lb ub
## 0,4033775 0,6203901
##
## $confidence_interval
## lb ub
## -0,08364405 0,13336849
##
## $confidence_interval
## lb ub
## -0,3205450 -0,1035325
td_sel.dt$cint <- conf.int(td_sel.dt, m = m); td_sel.dt$cint[1:3]
## $confidence_interval
## lb ub
## 0,3074994 0,5245119
##
## $confidence_interval
## lb ub
## -0,02185393 0,19515860
##
## $confidence_interval
## lb ub
## -0,4426156 -0,2256031
\[~\]
Show an example of confidence interval between two ROIs:
\[~\]
plotCint <- function(ROI.1, ROI.2, dt) {
hist(dt$cor.mean, probability = T, breaks = 50, col = "pink", main = "Interval of a data point", xlab = "olympic mean", ylab = "count", border = "deeppink")
corresponding.interval <- function(ROI.1, ROI.2, group) {
cint <- group[(group$ROI_1 == ROI.1) & (group$ROI_2 == ROI.2), "cint"]
lb <- cint[[1]]["lb"]; ub <- cint[[1]]["ub"]
estimate.corr <- group[(group$ROI_1 == ROI.1) & (group$ROI_2 == ROI.2), "cor.mean"]
info <- list("lb" = lb, "ub" = ub, "cor.mean" = estimate.corr)
return(info)
}
cint <- corresponding.interval(ROI.1 = ROI.1, ROI.2 = ROI.2, dt)
rect(cint$lb, -10, cint$ub, +10, border = NA, col = viridis::viridis(1, .3))
points(cint$cor.mean, 0, col = "green", pch = 19)
abline(v = cint$cor.mean, lwd = 3, col = viridis::viridis(1, .1))
text(cint$cor.mean, 2, expression(hat(p)[olymp]), pos = 4)
}
plotCint(ROI.1 = "2001", ROI.2 = "2002", asd_sel.dt)
\[~\]
plotCint(ROI.1 = "2001", ROI.2 = "2002", td_sel.dt)
\[~\]
We are now ready to estimate the adjacency matrix of the association graphs:
\[~\]
## Estimate the adjacency matrix given the Z-transformed Pearson correlation coefficient
get.est.adj.mat <- function(dt, dec.rule, t) {
# create the adj matrix
nameVals <- sort(unique(unlist(dt[1:2]))) # set up storage matrix, get names for row and columns
# construct 0 matrix of correct dimensions with row and column names
adj_mat <- matrix(0, length(nameVals), length(nameVals), dimnames = list(nameVals, nameVals))
# fill in the matrix with matrix indexing on row and column names
adj_mat[as.matrix(dt[c("ROI_1", "ROI_2")])] <- 0
# put edge according to the decision rule
# TODO: SUBSTITUTE THE FOR LOOP WITH A MORE EFFICIENT SOLUTION
for(idx in rownames(dt)){
if( dec.rule(dt, idx, t) ) {
adj_mat[as.character(dt[idx, "ROI_1"]), as.character(dt[idx, "ROI_2"])] <- 1
}
}
return(adj_mat)
}
## check if the two intervals int1 and int2 are overlapping
are.joint <- function(int1, int2) return((int1[1] <= int2[2]) && (int2[1] <= int1[2]))
## check the simple threshold condition
check.threshold <- function(dt, idx, t) {
val <- abs(dt[idx, "cor.mean"])
return(val >= t)
}
## check the confidence interval condition
check.conf.int <- function(dt, idx, t) {
c.int <- dt[idx, "cint"]$confidence_interval
t.int <- c(-t, t)
return(!are.joint(c.int, t.int))
}
\[~\]
The association graph taking into account the confidence interval with the Bonferroni’s correction is:
\[~\]
# Adjacency matrix of the estimated graph with the Bonferroni's correction
adj_mat_asd_bonf <- get.est.adj.mat(asd_sel.dt, check.conf.int, z.t)
adj_mat_td_bonf <- get.est.adj.mat(td_sel.dt, check.conf.int, z.t)
# Estimated graph
G.asd.bonf <- graph_from_adjacency_matrix(adj_mat_asd_bonf, mode = "undirected")
G.td.bonf <- graph_from_adjacency_matrix(adj_mat_td_bonf, mode = "undirected")
\[~\]
Plot the graphs with the threshold, with Bonferroni’s multiplicity adjustment:
\[~\]
# To plot the correlation graph(s)
plot(G.asd.bonf, vertex.size = 4, edge.width = .3, vertex.color = "orchid", label.col = "black",
main = "ASD Association Graph", sub = "(Bonferroni's multiplicity adjustment)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
plot(G.td.bonf, vertex.size = 4, edge.width = .3, vertex.color = "dodgerblue", label.col = "black",
main = "TD Association Graph", sub = "(Bonferroni's multiplicity adjustment)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
\[~\]
The association graph taking into only the threshold is:
\[~\]
# Adjacency matrix of the estimated graph with the Bonferroni's correction
adj_mat_asd_thr <- get.est.adj.mat(asd_sel.dt, check.threshold, z.t)
adj_mat_td_thr <- get.est.adj.mat(td_sel.dt, check.threshold, z.t)
# Estimated graph
G.asd.thr <- graph_from_adjacency_matrix(adj_mat_asd_thr, mode = "undirected")
G.td.thr <- graph_from_adjacency_matrix(adj_mat_td_thr, mode = "undirected")
\[~\]
Plot the graphs with the threshold, without adjustment:
\[~\]
plot(G.asd.thr, vertex.size = 4, edge.width = .3, vertex.color = "orchid", label.col = "black",
main = "ASD Association Graph", sub = "(Threshold only)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
plot(G.td.thr, vertex.size = 4, edge.width = .3, vertex.color = "dodgerblue", label.col = "black",
main = "TD Association Graph", sub = "(Threshold only)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
\[~\]
The association graph taking into account the confidence interval without the Bonferroni’s correction is:
\[~\]
# Recompute the confidence intervals without the multiplicity adjustment
asd_sel.dt$cint <- conf.int(asd_sel.dt, m = 1)
td_sel.dt$cint <- conf.int(td_sel.dt, m = 1)
# Adjacency matrix of the estimated graph with the Bonferroni's correction
adj_mat_asd_no_bonf <- get.est.adj.mat(asd_sel.dt, check.conf.int, z.t)
adj_mat_td_no_bonf <- get.est.adj.mat(td_sel.dt, check.conf.int, z.t)
# Estimated graph
G.asd.no.bonf <- graph_from_adjacency_matrix(adj_mat_asd_no_bonf, mode = "undirected")
G.td.no.bonf <- graph_from_adjacency_matrix(adj_mat_td_no_bonf, mode = "undirected")
\[~\]
Plot the graphs without Bonferroni’s multiplicity adjustment:
\[~\]
plot(G.asd.no.bonf, vertex.size = 4, edge.width = .3, vertex.color = "orchid", label.col = "black",
main = "ASD Association Graph", sub = "(Without Bonferroni's multiplicity adjustment)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
plot(G.td.no.bonf, vertex.size = 4, edge.width = .3, vertex.color = "dodgerblue", label.col = "black",
main = "TD Association Graph", sub = "(Without Bonferroni's multiplicity adjustment)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
\[~\]
Let’s check the number of edges of the estimated ASD graph:
\[~\]
paste("Number of edges considering only the threshold t: ", gsize(G.asd.thr))
## [1] "Number of edges considering only the threshold t: 2869"
paste("Number of edges with Bonferroni's correction: ", gsize(G.asd.bonf))
## [1] "Number of edges with Bonferroni's correction: 1073"
paste("Number of edges without Bonferroni's correction: ", gsize(G.asd.no.bonf))
## [1] "Number of edges without Bonferroni's correction: 1893"
\[~\]
\[~\]
Define the difference graph in order to create the boostrap samples, the difference graph is based on the difference between \[ \Big\|\triangle_{j,k}\Big\| = \Big\|{\rho}^{ASD}_{j,k} - {\rho}^{TD}_{j,k} \Big\|, \text{ having } j,k \in \{1,...,116\}, \text{ } j\neq k \]
\[~\]
# Considering the two main datasets for these two groups, obtain the difference graph
olymp.diff <- abs(asd_sel.dt[, 3] - td_sel.dt[, 3]) # olympic mean
diff.dt <- asd_sel.dt[, 1:2]
diff.dt$cor.mean <- olymp.diff
\[~\]
The threshold is considered as the 80% percentile as we saw before:
z.diff.t <- apply(diff.dt["cor.mean"], 2, quantile, probs=0.8)
paste("the threshold at the 80th percentile for the subjects is: ", round(as.numeric(z.diff.t), 3), sep = "")
## [1] "the threshold at the 80th percentile for the subjects is: 0,121"
\[~\]
Try to create the adjacency matrix considering the same threshold chose before:
# Adjacency matrix of the estimated graph with only the threshold
adj_mat_diff_thr <- get.est.adj.mat(diff.dt, check.threshold, z.diff.t)
# Estimated graph
G.diff.thr <- graph_from_adjacency_matrix(adj_mat_diff_thr, mode = "undirected")
\[~\]
Plot the graphs with the threshold, without adjustment:
plot(G.diff.thr, vertex.size = 4, edge.width = .3, vertex.color = "green", label.col = "black",
main = "Association Difference Graph", sub = "(Threshold only)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)
\[~\]
Create the confidence interval with the boostrap procedure:
## Concatenate the patients into one patient
concat.data <- function(dataset) {
out <- dataset[[1]]
for(i in 2:length(dataset)) out <- rbind(out, dataset[[i]])
return(out)
}
## Non-parametric bootstrap
non.parametric.bootstrap <- function(data1, data2, B = 1000) {
p.samples <- abs(atan(cor(data1)) - atan(cor(data2)))
n <- ncol(p.samples)
p.boot <- array(NA, dim = c(B, n, n))
for(b in 1:B) {
idx <- sample(1:n, n, replace = TRUE)
p.boot[b,,] <- abs(atan(cor(data1[idx,])) - atan(cor(data2[idx,])))
}
return(list(p.samples = p.samples, p.boot = p.boot))
}
\[~\]
As hinted in the “Problem statement” section, since we assume the patients in each group as IID, we concatenate their measurments into a unique “patient”:
\[~\]
asd_fusion <- concat.data(asd_sel)
td_fusion <- concat.data(td_sel)
\[~\]
\[~\]
out.boot <- non.parametric.bootstrap(asd_fusion, td_fusion, B = 1000)
# Normal Bootstrap CI
alpha <- 0.05 # 95%
m <- (116 * 115) / 2
mean.boot <- apply(out.boot$p.boot, 2:3, mean)
se.boot <- apply(out.boot$p.boot, 2:3, sd)
feature.names <- colnames(asd_fusion)
n <- length(feature.names)
V1 <- rep(feature.names[1:(n-1)], seq(from=n-1, to = 1, by = -1))
V2 <- unlist(lapply(1:(n-1), function(i)(feature.names[(i+1):n])))
diff.dt <- data.frame(list(ROI_1 = V1, ROI_2 = V2))
diff.dt$mean.boot <- as.vector(mean.boot[upper.tri(mean.boot, diag = F)])
diff.dt$se.boot <- as.vector(se.boot[upper.tri(se.boot, diag = F)])
# diff.dt <- diff.dt[!is.na(diff.dt$mean.boot),]
diff.dt$cint <- sapply(1:nrow(diff.dt), function(idx) {
list(confidence_interval = c(lb = diff.dt$mean.boot[idx] - diff.dt$se.boot[idx] * qnorm(1 - alpha / (2*m)),
ub = diff.dt$mean.boot[idx] + diff.dt$se.boot[idx] * qnorm(1 - alpha / (2*m))))
})
\[~\]
# Adjacency matrix of the estimated graph with only the threshold
adj_mat_diff_boot <- get.est.adj.mat(diff.dt, check.conf.int, z.diff.t)
# Estimated graph
G.diff.boot.bonf <- graph_from_adjacency_matrix(adj_mat_diff_boot, mode = "undirected")
plot(G.diff.boot.bonf, vertex.size = 4, edge.width = .3, vertex.color = "skyblue", label.col = "black",
main = "Association Difference Graph", sub = "(Bonferroni's multiplicity adjustment)",
layout = layout.kamada.kawai, vertex.label.cex = 0.7, vertex.label.dist = .8)